From 1d1c34f3c83e12d65a3f18b0b748ddd9b0040ddf Mon Sep 17 00:00:00 2001 From: =?utf8?q?=C3=98yvind=20Kol=C3=A5s?= Date: Sat, 19 Nov 2016 18:27:06 +0100 Subject: [PATCH] make pow implementations inlineable - gaining some fixed amount of blitting performance --- babl/base/pow-24.c | 3 +- babl/base/pow-24.h | 118 ++++++++++++++++++++++++++++++++++++++-- extensions/sse2-float.c | 14 ++--- 3 files changed, 123 insertions(+), 12 deletions(-) diff --git a/babl/base/pow-24.c b/babl/base/pow-24.c index 03fca43..dbc4071 100644 --- a/babl/base/pow-24.c +++ b/babl/base/pow-24.c @@ -18,7 +18,7 @@ #include #include "util.h" - +#if 0 /* a^b = exp(b*log(a)) * * Extracting the exponent from a float gives us an approximate log. @@ -125,3 +125,4 @@ babl_pow_1_24f (float x) y = (7.f/6.f) * y - z * ((y*y)*(y*y)*(y*y*y)); return x*y; } +#endif diff --git a/babl/base/pow-24.h b/babl/base/pow-24.h index 2fb338f..cbd6fd4 100644 --- a/babl/base/pow-24.h +++ b/babl/base/pow-24.h @@ -19,9 +19,119 @@ #ifndef _BASE_POW_24_H #define _BASE_POW_24_H -extern double babl_pow_1_24 (double x); -extern double babl_pow_24 (double x); -extern float babl_pow_1_24f (float x); -extern float babl_pow_24f (float x); +static double babl_pow_1_24 (double x); +static double babl_pow_24 (double x); +static float babl_pow_1_24f (float x); +static float babl_pow_24f (float x); + + +/* a^b = exp(b*log(a)) + * + * Extracting the exponent from a float gives us an approximate log. + * Or better yet, reinterpret the bitpattern of the whole float as an int. + * + * However, the output values of 12throot vary by less than a factor of 2 + * over the domain we care about, so we only get log() that way, not exp(). + * + * Approximate exp() with a low-degree polynomial; not exactly equal to the + * Taylor series since we're minimizing maximum error over a certain finite + * domain. It's not worthwhile to use lots of terms, since Newton's method + * has a better convergence rate once you get reasonably close to the answer. + */ +static inline double +init_newton (double x, double exponent, double c0, double c1, double c2) +{ + int iexp; + double y = frexp(x, &iexp); + y = 2*y+(iexp-2); + c1 *= M_LN2*exponent; + c2 *= M_LN2*M_LN2*exponent*exponent; + return y = c0 + c1*y + c2*y*y; +} + +/* Returns x^2.4 == (x*(x^(-1/5)))^3, using Newton's method for x^(-1/5). + */ +static inline double +babl_pow_24 (double x) +{ + double y = init_newton (x, -1./5, 0.9953189663, 0.9594345146, 0.6742970332); + int i; + for (i = 0; i < 3; i++) + y = (1.+1./5)*y - ((1./5)*x*(y*y))*((y*y)*(y*y)); + x *= y; + return x*x*x; +} + +/* Returns x^(1/2.4) == x*((x^(-1/6))^(1/2))^7, using Newton's method for x^(-1/6). + */ +static inline double +babl_pow_1_24 (double x) +{ + double y = init_newton (x, -1./12, 0.9976800269, 0.9885126933, 0.5908575383); + int i; + double z; + x = sqrt (x); + /* newton's method for x^(-1/6) */ + z = (1./6.) * x; + for (i = 0; i < 3; i++) + y = (7./6.) * y - z * ((y*y)*(y*y)*(y*y*y)); + return x*y; +} + +////////////////////////////////////////////// +/* a^b = exp(b*log(a)) + * + * Extracting the exponent from a float gives us an approximate log. + * Or better yet, reinterpret the bitpattern of the whole float as an int. + * + * However, the output values of 12throot vary by less than a factor of 2 + * over the domain we care about, so we only get log() that way, not exp(). + * + * Approximate exp() with a low-degree polynomial; not exactly equal to the + * Taylor series since we're minimizing maximum error over a certain finite + * domain. It's not worthwhile to use lots of terms, since Newton's method + * has a better convergence rate once you get reasonably close to the answer. + */ +static inline float +init_newtonf (float x, float exponent, float c0, float c1, float c2) +{ + int iexp; + float y = frexpf(x, &iexp); + y = 2*y+(iexp-2); + c1 *= M_LN2*exponent; + c2 *= M_LN2*M_LN2*exponent*exponent; + return y = c0 + c1*y + c2*y*y; +} + +/* Returns x^2.4 == (x*(x^(-1/5)))^3, using Newton's method for x^(-1/5). + */ +static inline float +babl_pow_24f (float x) +{ + float y = init_newtonf (x, -1.f/5, 0.9953189663f, 0.9594345146f, 0.6742970332f); + int i; + for (i = 0; i < 3; i++) + y = (1.f+1.f/5)*y - ((1./5)*x*(y*y))*((y*y)*(y*y)); + x *= y; + return x*x*x; +} + +/* Returns x^(1/2.4) == x*((x^(-1/6))^(1/2))^7, using Newton's method for x^(-1/6). + */ +static inline float +babl_pow_1_24f (float x) +{ + float y = init_newtonf (x, -1.f/12, 0.9976800269f, 0.9885126933f, 0.5908575383f); + int i; + float z; + x = sqrtf (x); + /* newton's method for x^(-1/6) */ + z = (1.f/6.f) * x; + for (i = 0; i < 3; i++) + y = (7.f/6.f) * y - z * ((y*y)*(y*y)*(y*y*y)); + return x*y; +} + + #endif diff --git a/extensions/sse2-float.c b/extensions/sse2-float.c index 8148bf2..72463ee 100644 --- a/extensions/sse2-float.c +++ b/extensions/sse2-float.c @@ -244,7 +244,7 @@ conv_rgbAF_linear_rgbaF_linear_spin (const float *src, float *dst, long samples) #define FLT_MANTISSA (1<<23) static inline __v4sf -init_newton (__v4sf x, double exponent, double c0, double c1, double c2) +sse_init_newton (__v4sf x, double exponent, double c0, double c1, double c2) { double norm = exponent*M_LN2/FLT_MANTISSA; __v4sf y = _mm_cvtepi32_ps((__m128i)((__v4si)x - splat4i(FLT_ONE))); @@ -252,10 +252,10 @@ init_newton (__v4sf x, double exponent, double c0, double c1, double c2) } static inline __v4sf -pow_1_24 (__v4sf x) +sse_pow_1_24 (__v4sf x) { __v4sf y, z; - y = init_newton (x, -1./12, 0.9976800269, 0.9885126933, 0.5908575383); + y = sse_init_newton (x, -1./12, 0.9976800269, 0.9885126933, 0.5908575383); x = _mm_sqrt_ps (x); /* newton's method for x^(-1/6) */ z = splat4f (1.f/6.f) * x; @@ -265,10 +265,10 @@ pow_1_24 (__v4sf x) } static inline __v4sf -pow_24 (__v4sf x) +sse_pow_24 (__v4sf x) { __v4sf y, z; - y = init_newton (x, -1./5, 0.9953189663, 0.9594345146, 0.6742970332); + y = sse_init_newton (x, -1./5, 0.9953189663, 0.9594345146, 0.6742970332); /* newton's method for x^(-1/5) */ z = splat4f (1.f/5.f) * x; y = splat4f (6.f/5.f) * y - z * ((y*y*y)*(y*y*y)); @@ -280,7 +280,7 @@ pow_24 (__v4sf x) static inline __v4sf linear_to_gamma_2_2_sse2 (__v4sf x) { - __v4sf curve = pow_1_24 (x) * splat4f (1.055f) - splat4f (0.055f); + __v4sf curve = sse_pow_1_24 (x) * splat4f (1.055f) - splat4f (0.055f); __v4sf line = x * splat4f (12.92f); __v4sf mask = _mm_cmpgt_ps (x, splat4f (0.003130804954f)); return _mm_or_ps (_mm_and_ps (mask, curve), _mm_andnot_ps (mask, line)); @@ -289,7 +289,7 @@ linear_to_gamma_2_2_sse2 (__v4sf x) static inline __v4sf gamma_2_2_to_linear_sse2 (__v4sf x) { - __v4sf curve = pow_24 ((x + splat4f (0.055f)) * splat4f (1/1.055f)); + __v4sf curve = sse_pow_24 ((x + splat4f (0.055f)) * splat4f (1/1.055f)); __v4sf line = x * splat4f (1/12.92f); __v4sf mask = _mm_cmpgt_ps (x, splat4f (0.04045f)); return _mm_or_ps (_mm_and_ps (mask, curve), _mm_andnot_ps (mask, line)); -- 2.30.2